# PLAN:
# A) ISOLATION FOREST
# B) SAMPLE HOURLY OR 5 MIN
# single sensor
# bmp180
# temp
# 1.5 years
import warnings
warnings.filterwarnings('ignore')
import os
import dask
import dask.dataframe as dd
from dask.distributed import Client, progress
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly
# import plotly.plotly as py
import plotly.graph_objs as go
init_notebook_mode(connected=True)
%cd D:\CRITICAL_MAIN_DATAFILE__MILESTONE_II\AoT_Chicago.complete.2021-09-12
D:\CRITICAL_MAIN_DATAFILE__MILESTONE_II\AoT_Chicago.complete.2021-09-12
# Load ~1.8M temperature readings from a single bmp180 sensor.
# parse_dates gives a true DatetimeIndex so later date-range slicing
# (tdf = df.loc['2018-09-01':'2018-10-01']) uses real datetime semantics
# instead of lexicographic string comparison.
df = pd.read_csv('use_for_UML.csv',
                 index_col=['timestamp'],
                 parse_dates=['timestamp'])  # 1.8M rows
df.head()
df.tail()
| sensor | value_hrf | T | |
|---|---|---|---|
| timestamp | |||
| 2018-05-01 00:00:18 | bmp180 | 30.2 | 2018-05-01 00:00:18 |
| 2018-05-01 00:00:44 | bmp180 | 30.3 | 2018-05-01 00:00:44 |
| 2018-05-01 00:01:10 | bmp180 | 30.4 | 2018-05-01 00:01:10 |
| 2018-05-01 00:01:35 | bmp180 | 30.3 | 2018-05-01 00:01:35 |
| 2018-05-01 00:02:01 | bmp180 | 30.4 | 2018-05-01 00:02:01 |
| sensor | value_hrf | T | |
|---|---|---|---|
| timestamp | |||
| 2019-10-31 23:57:55 | bmp180 | 2.5 | 2019-10-31 23:57:55 |
| 2019-10-31 23:58:20 | bmp180 | 2.5 | 2019-10-31 23:58:20 |
| 2019-10-31 23:58:45 | bmp180 | 2.5 | 2019-10-31 23:58:45 |
| 2019-10-31 23:59:10 | bmp180 | 2.6 | 2019-10-31 23:59:10 |
| 2019-10-31 23:59:35 | bmp180 | 2.6 | 2019-10-31 23:59:35 |
# NOTE(review): duplicate of the df.tail() call above; kept only for the cell output
df.tail()
| sensor | value_hrf | T | |
|---|---|---|---|
| timestamp | |||
| 2019-10-31 23:57:55 | bmp180 | 2.5 | 2019-10-31 23:57:55 |
| 2019-10-31 23:58:20 | bmp180 | 2.5 | 2019-10-31 23:58:20 |
| 2019-10-31 23:58:45 | bmp180 | 2.5 | 2019-10-31 23:58:45 |
| 2019-10-31 23:59:10 | bmp180 | 2.6 | 2019-10-31 23:59:10 |
| 2019-10-31 23:59:35 | bmp180 | 2.6 | 2019-10-31 23:59:35 |
# Keep only the numeric reading: 'sensor' is constant ('bmp180' throughout)
# and 'T' merely duplicates the timestamp index, so neither carries signal.
df.drop(columns=['sensor', 'T'], inplace=True)
df.dtypes
value_hrf float64 dtype: object
# 1.8M rows of temp from a single sensor...
df.head(10)
| value_hrf | |
|---|---|
| timestamp | |
| 2018-05-01 00:00:18 | 30.2 |
| 2018-05-01 00:00:44 | 30.3 |
| 2018-05-01 00:01:10 | 30.4 |
| 2018-05-01 00:01:35 | 30.3 |
| 2018-05-01 00:02:01 | 30.4 |
| 2018-05-01 00:02:27 | 30.3 |
| 2018-05-01 00:02:52 | 30.3 |
| 2018-05-01 00:03:18 | 30.2 |
| 2018-05-01 00:03:44 | 30.2 |
| 2018-05-01 00:04:09 | 30.2 |
# # Using graph_objects
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# import plotly
# import matplotlib.pyplot as plt
# from matplotlib import pyplot
# import plotly.graph_objs as go
# init_notebook_mode(connected=True)
# import plotly.graph_objs as go
# fig = go.Figure(data=[go.Scatter(x=df.index, y=df.value_hrf)])
# iplot(fig);
# Work on a single month (Sept 2018) so the Isolation Forest fits quickly.
tdf = df.loc['2018-09-01':'2018-10-01'].copy()
del df # to be safe -- release the full 1.8M-row frame before modelling
len(tdf) # 87,563 rows in this slice (the full frame was ~1.8M)
87563
# from sklearn.ensemble import IsolationForest
# clf = IsolationForest(n_estimators=100,
# max_samples='auto',
# contamination=float(.01),
# max_features=1.0,
# bootstrap=False,
# n_jobs=-1,
# random_state=42,
# verbose=1)
# clf.fit(tdf[['value_hrf']]) # since 1D !
# # The predict function classifies the data as anomalies
# # based on the results from decision function on crossing a threshold
# tdf['scores']=clf.decision_function(tdf[['value_hrf']])
# #tdf['anomaly']=clf.predict(tdf[['value_hrf']])
# #tdf.loc[tdf['anomaly'] == 1,'anomaly'] = 0
# #tdf.loc[tdf['anomaly'] == -1,'anomaly'] = 1
# ## classified as -1 is 'anomalous'
# #tdf.anomaly.value_counts()
# pred = clf.predict(tdf[['value_hrf']])
# tdf['anomaly']=pred
# outliers=tdf.loc[tdf['anomaly']==-1]
# outlier_index=list(outliers.index)
# #Find the number of anomalies and normal points here points classified -1 are anomalous
# print(tdf['anomaly'].value_counts())
# IsolationForest(contamination=0.01, n_jobs=-1, random_state=42, verbose=1)
# 1 86706
# -1 857
# Name: anomaly, dtype: int64
# print("Percentage of anomalies in data: {:.2f}".format((len(tdf.loc[tdf['anomaly']==-1])/len(tdf))*100))
# import seaborn as sns
# import matplotlib.pyplot as plt
# %matplotlib inline
# # sns.set(style="darkgrid")
# fig, ax = plt.subplots(figsize=(15,10))
# sns.histplot(data=tdf, x="scores")
# plt.show();
# WORKS, BUT IS BIG FILE ! ! !
# dates = df['T']
# from plotly.offline import init_notebook_mode, iplot
# # Plot the actuals points
# Actuals = go.Scatter(name='Actuals',
# x=dates,
# y=df['value_hrf'],
# xaxis='x1',
# yaxis='y1',
# mode='line',
# # marker=dict(size=12,
# # line=dict(width=1),
# # color="gray"))
# fig = go.Figure(data=[Actuals])
# iplot(fig)
# # plt.show()
# IM PRETTY SURE THIS WORKS:
# # Using graph_objects
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# import plotly
# import matplotlib.pyplot as plt
# from matplotlib import pyplot
# import plotly.graph_objs as go
# init_notebook_mode(connected=True) # need
# import plotly.graph_objs as go
# fig = go.Figure(data=[go.Scatter(x=df.index,
# y=df.value_hrf)])
# # to add it as true scatter, use mode='markers')
# iplot(fig)
## i think i have to have another column in the df that
## is timeframe, i can't use index
# Plotly needs an explicit column for the x-axis -- it can't take the index
# directly -- so materialize the index as a proper datetime column.
# (Single pd.to_datetime call replaces the old copy-then-astype two-step.)
tdf['T'] = pd.to_datetime(tdf.index)
tdf.head()
| value_hrf | T | |
|---|---|---|
| timestamp | ||
| 2018-09-01 00:00:14 | 36.5 | 2018-09-01 00:00:14 |
| 2018-09-01 00:00:39 | 36.5 | 2018-09-01 00:00:39 |
| 2018-09-01 00:01:04 | 36.5 | 2018-09-01 00:01:04 |
| 2018-09-01 00:01:29 | 36.5 | 2018-09-01 00:01:29 |
| 2018-09-01 00:01:55 | 36.5 | 2018-09-01 00:01:55 |
from sklearn.ensemble import IsolationForest

# Unsupervised outlier detector: each tree isolates points by random splits;
# points that isolate in few splits score as anomalies.
clf = IsolationForest(
    n_estimators=100,     # number of base isolation trees in the ensemble
    max_samples='auto',   # min(256, n_samples) per tree, as in the original paper
    contamination=0.011,  # assumed fraction of outliers; critical to the output threshold
    max_features=1.0,     # single-feature data, so every tree sees the one column
    bootstrap=False,      # sample without replacement when building each tree
    n_jobs=-1,            # -1 means using all processors
    random_state=42,      # reproducible tree construction
    verbose=1,
)
# fit() requires a 2-D array, hence the double-bracket column selection
clf.fit(tdf[['value_hrf']])
[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers. [Parallel(n_jobs=24)]: Done 2 out of 24 | elapsed: 0.2s remaining: 2.8s [Parallel(n_jobs=24)]: Done 24 out of 24 | elapsed: 0.2s finished
IsolationForest(contamination=0.011, n_jobs=-1, random_state=42, verbose=1)
# Per-sample anomaly score: the mean over the forest of a path-depth-based
# normality measure (deeper leaf = harder to isolate = more normal).
# Scores <= 0 cross the contamination threshold and are flagged anomalous.
tdf['scores'] = clf.decision_function(tdf[['value_hrf']])

# predict() returns +1 (inlier) / -1 (outlier). Remap in one vectorized
# step to 0 (normal) / 1 (anomaly) so the column counts and sums directly.
tdf['anomaly'] = (clf.predict(tdf[['value_hrf']]) == -1).astype(int)

tdf.anomaly.value_counts()
0 86637 1 926 Name: anomaly, dtype: int64
# data['scores'] = model.decision_function(data[['marks']])
# data['anomaly_score'] = model.predict(data[['marks']])
# data[data['anomaly_score']==-1].head()
tdf.scores.head() # see, its a value shown example
timestamp 2018-09-01 00:00:14 0.167866 2018-09-01 00:00:39 0.167866 2018-09-01 00:01:04 0.167866 2018-09-01 00:01:29 0.167866 2018-09-01 00:01:55 0.167866 Name: scores, dtype: float64
clf.get_params()
{'bootstrap': False,
'contamination': 0.011,
'max_features': 1.0,
'max_samples': 'auto',
'n_estimators': 100,
'n_jobs': -1,
'random_state': 42,
'verbose': 1,
'warm_start': False}
import seaborn as sns
# quick figure-level look at the anomaly-score distribution
sns.displot(tdf, x="scores");
# outliers score 0 or below; the higher the score, the more likely an inlier
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# sns.set(style="darkgrid")
# same distribution again, on a larger axes-level figure
fig, ax = plt.subplots(figsize=(15,10))
sns.histplot(data=tdf, x="scores")
plt.show();
# explicit datetime column used as the plotly x-axis (the index alone won't do)
dates = tdf['T']
# WORKING
from plotly.offline import init_notebook_mode, iplot
# Plot the actual sensor readings as gray points on shared axes (x1, y1),
# so the anomaly overlay below can be drawn on the same figure.
original_data = go.Scatter(name='Actuals',
                           x=dates,
                           y=tdf['value_hrf'],
                           xaxis='x1',
                           yaxis='y1',
                           # mode = ?
                           marker=dict(size=5,
                                       line=dict(width=1),
                                       color="gray"))
fig = go.Figure(data=[original_data])
fig = fig.update_layout(title="", template="plotly_dark")
fig
# iplot(fig)
# this is just actuals, you could clean up to look prettier if you wanted ! ! !
# this is just actuals, you could clean up to look prettier if you wanted ! ! !
# WORKING:
# Keep the actual value only where the row is flagged anomalous; everywhere
# else becomes NaN, which plotly simply skips when drawing markers.
# (Series.where replaces the old multiply-then-"0 means missing" trick,
# which would also have blanked out any genuine anomalous reading of 0.0.)
anomaly_pts = tdf['value_hrf'].where(tdf['anomaly'] == 1)
anomalies_red = go.Scatter(name="Anomaly",
                           showlegend=True,
                           x=dates,
                           y=anomaly_pts,
                           mode='markers',
                           xaxis='x1',
                           yaxis='y1',
                           marker=dict(color="red",
                                       size=1,
                                       opacity=0.6,
                                       line=dict(color="red",
                                                 width=1)))
layout = dict(width=1000,
              height=865,
              autosize=False,
              title="demo",
              margin=dict(t=75),
              showlegend=True)
# anomalies alone first...
fig = go.Figure(data=[anomalies_red], layout=layout)
fig = fig.update_layout(title="", template="plotly_dark")
iplot(fig)
# JUST SHOWING ANOMALIES IF YOU WANTED...
# ...then anomalies overlaid in red on the gray actuals
fig = go.Figure(data=[original_data, anomalies_red], layout=layout)
fig = fig.update_layout(title="", template="plotly_dark")
iplot(fig)
# when it is safe, you can kill this off ...
# from sklearn.ensemble import IsolationForest
# clf = IsolationForest(n_estimators=100,
# max_samples='auto',
# contamination=float(.01),
# max_features=1.0,
# bootstrap=False,
# n_jobs=-1,
# random_state=42,
# verbose=1)
# clf.fit(tdf[['value_hrf']]) # since 1D !
# # The predict function classifies the data as anomalies
# # based on the results from decision function on crossing a threshold
# tdf['score']=clf.decision_function(tdf[['value_hrf']])
# #tdf['anomaly']=clf.predict(tdf[['value_hrf']])
# #tdf.loc[tdf['anomaly'] == 1,'anomaly'] = 0
# #tdf.loc[tdf['anomaly'] == -1,'anomaly'] = 1
# ## classified as -1 is 'anomalous'
# #tdf.anomaly.value_counts()
# pred = clf.predict(tdf[['value_hrf']])
# tdf['anomaly']=pred
# outliers=tdf.loc[tdf['anomaly']==-1]
# outlier_index=list(outliers.index)
# #Find the number of anomalies and normal points here points classified -1 are anomalous
# # print(tdf['anomaly'].value_counts())
# # test_df['actuals']=metrics_df.iloc[:,i:i+1]
# #Get the indexes of outliers in order to compare the metrics with use case anomalies if required
# plot_anomaly(tdf, tdf.value_hrf)
# # specify the single metrics column name to be modelled
# # you COULD do this for the entire sensor ! ! !
# to_model_columns='value_hrf'
# from sklearn.ensemble import IsolationForest
# clf=IsolationForest(n_estimators=100,
# max_samples='auto',
# max_features=1.0,
# contamination=float(.01),
# bootstrap=False,
# n_jobs=-1,
# random_state=42,
# verbose=1)
# clf.fit(df[[to_model_columns]])
# pred = clf.predict(df[[to_model_columns]])
# df['anomaly']=pred
# outliers=df.loc[df['anomaly']==-1]
# outlier_index=list(outliers.index)
# # print(outlier_index)
# # Find the number of anomalies and normal points here points classified -1 are anomalous
# print(df['anomaly'].value_counts())
# df.drop(['anomaly'], axis=1, inplace=True)
Isolation forests are an unsupervised, tree-ensemble anomaly-detection method in the same family as the popular random forest algorithm. The building blocks of an isolation forest are isolation trees, each yielding a binary outcome (is/is not an outlier).
When an isolation forest is built, the algorithm splits each individual data point off from all other data points. The easier it is to isolate a single point from all the others, the more likely it is an outlier (because it lies far away from the rest of the data). If a data point is an inlier, it is closely surrounded by other data points and takes more splits to isolate (1). See the graphic below as an illustration.
# # Using graph_objects
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# import plotly
# import matplotlib.pyplot as plt
# from matplotlib import pyplot
# import plotly.graph_objs as go
# init_notebook_mode(connected=True)
# import plotly.graph_objs as go
# i think this is working...
# it is huge i think...
# import pandas as pd
# import plotly.express as px
# fig = px.scatter(df, x='T',
# y='value_hrf',
# # color='output',
# hover_data=['T'])
# fig.update_layout(
# autosize=False,
# width=1000,
# height=800,
# margin=dict(
# l=10,
# r=10,
# b=10,
# t=10,
# pad=4
# ),
# paper_bgcolor="white",
# )
# fig.update_traces(marker={'size': 1})
# fig.show()
# import pandas as pd
# import plotly.express as px
# fig = px.scatter(df, x='T',
# y='value_hrf',
# # color='output',
# hover_data=['T'])
# fig.update_layout(
# autosize=False,
# width=800,
# height=500,
# margin=dict(
# l=10,
# r=10,
# b=10,
# t=10,
# pad=2
# ),
# paper_bgcolor="white",
# )
# fig.update_traces(marker={'size': 2})
# fig.show()
# fig.write_html("high_resolution_2.html")
# import pandas as pd
# import plotly.express as px
# # fig = px.scatter(df, x='T',
# # y='value_hrf',
# # # color='output',
# # hover_data=['T'])
# # fig.update_layout(
# # autosize=False,
# # width=800,
# # height=500,
# # margin=dict(
# # l=10,
# # r=10,
# # b=10,
# # t=10,
# # pad=2
# # ),
# # paper_bgcolor="white",
# # )
# # fig.update_traces(marker={'size': 2})
# # fig.show()
# import plotly.graph_objects as go
# fig = go.Figure(data=go.Scattergl(
# x = df.T,
# y = df.value_hrf,
# mode='markers',
# ))
# fig.show()
# fig.write_html("high_resolution_1.html")
# WORKS, you can uncomment out if you wanted ...
# # df=full_df.loc[(full_df['timestamp']>'2014-02-17 00:00:00')&(full_df['timestamp']<'2014-02-17 23:59:59')]
# # Using graph_objects
# import plotly.graph_objects as go
# plot_data=go.Scatter(x=df['T'], y=df['value_hrf'])
# fig=go.Figure(data=[plot_data])
# fig.update_layout(
# autosize=False,
# width=1000,
# height=800,
# margin=dict(
# l=10,
# r=10,
# b=10,
# t=10,
# pad=4
# ),
# paper_bgcolor="white",
# )
# # fig.show()
# iplot(fig)
# fig.write_html("high_resolution_1.html")
# def isolation_forest_anomaly_detection(df,
# column_name,
# outliers_fraction):
# """
# In this definition, time series anomalies are detected using an Isolation Forest algorithm.
# Arguments:
# df: Pandas dataframe
# column_name: string. Name of the column that we want to detect anomalies in
# outliers_fraction: float. Percentage of outliers allowed in the sequence.
# Outputs:
# df: Pandas dataframe with column for detected Isolation Forest anomalies (True/False)
# """
# #Scale the column that we want to flag for anomalies
# min_max_scaler = preprocessing.StandardScaler()
# np_scaled = min_max_scaler.fit_transform(df[[column_name]])
# scaled_time_series = pd.DataFrame(np_scaled)
# # train isolation forest
# model = IsolationForest(contamination = outliers_fraction,
# behaviour='new')
# model.fit(scaled_time_series)
# #Generate column for Isolation Forest-detected anomalies
# isolation_forest_anomaly_column = column_name+'_Isolation_Forest_Anomaly'
# df[isolation_forest_anomaly_column] = model.predict(scaled_time_series)
# df[isolation_forest_anomaly_column] = df[isolation_forest_anomaly_column].map( {1: False, -1: True} )
# return df
# ## EXECUTE IN MAIN BLOCK
# # APPLY ISOLATION FOREST TO DETECT ANOMALIES
# df=isolation_forest_anomaly_detection(df,
# column_name='value_hrf',
# outliers_fraction=.01)
# # Re-plot time series with color coding for anomaly column
# scatterplot_with_color_coding(gasoline_price_df['Date'],
# gasoline_price_df['Gasoline_Price'],
# gasoline_price_df['Gasoline_Price_Isolation_Forest_Anomaly'],
# 'Date',
# 'Gasoline Price (Dollars Per Gallon)',
# 'Gasoline Prices, Color-Coded on Isolation Forest Anomalies')
# from statsmodels.tsa.seasonal import seasonal_decompose
# seasonal_decompose(df.value_hrf, model='additive', freq=1).plot()
# seasonal_decompose(df.value_hrf, model='additive', period=365).plot()
# from matplotlib import pyplot as plt
# result = seasonal_decompose(df.value_hrf, model='additive', period=1)
# # plt.figure(figsize=(1,1))
# result.plot()
# plt.show();
# single_year = temp_df.copy()
# single_year = single_year[single_year['sensor']=='bmp180']
# single_year = single_year.loc['2019-01-01':'2019-12-31']
# import seaborn as sns
# plt.figure(figsize = (19,20))
# sns.lineplot(y = single_year.value_hrf.values,
# x = single_year.index.values);
# # type(single_year['value_hrf'].head(10))
# single_year['value_hrf'] = pd.to_numeric(single_year['value_hrf'])
# from pandas import read_csv
# from matplotlib import pyplot
# single_year['value_hrf'].plot()
# plt.show();
# # single_year['value_hrf'].plot(linewidth=0.5);
# # import seaborn as sns
# # plt.figure(figsize = (19,20))
# # sns.lineplot(y = single_year.value_hrf.values,
# # x = single_year.index.values);
# # Using plotly.express
# import plotly.express as px
# fig = px.line(single_year, x='T', y="value_hrf")
# fig.show()
# diff_year = temp_df.copy()
# diff_year = diff_year[diff_year['sensor']=='bmp180']
# diff_year = diff_year.loc['2018-09-01':'2019-09-01']
# # Using plotly.express
# import plotly.express as px
# fig = px.line(diff_year, x='T', y="value_hrf")
# fig.show()
# import plotly.express as px
# df = px.data.stocks(indexed=True)-1
# fig = px.area(df, facet_col="company", facet_col_wrap=2)
# fig.show()
# import dash
# import dash_core_components as dcc
# import dash_html_components as html
# from dash.dependencies import Input, Output
# import plotly.express as px
# df = px.data.stocks()
# app = dash.Dash(__name__)
# app.layout = html.Div([
# dcc.Dropdown(
# id="ticker",
# options=[{"label": x, "value": x}
# for x in df.columns[1:]],
# value=df.columns[1],
# clearable=False,
# ),
# dcc.Graph(id="time-series-chart"),
# ])
# @app.callback(
# Output("time-series-chart", "figure"),
# [Input("ticker", "value")])
# def display_time_series(ticker):
# fig = px.line(df, x='date', y=ticker)
# return fig
# app.run_server(debug=True)
# import plotly.express as px
# df = px.data.stocks(indexed=True)-1
# fig = px.area(df, facet_col="company", facet_col_wrap=2)
# fig.show()
# # Using plotly.express
# import plotly.express as px
# df = px.data.stocks()
# fig = px.line(df, x='date', y="GOOG")
# fig.show()
# import plotly.express as px
# df = px.data.stocks(indexed=True)-1
# fig = px.area(df, facet_col="company", facet_col_wrap=2)
# fig.show()
# import plotly.express as px
# df = px.data.stocks(indexed=True)-1
# fig = px.area(df, facet_col="company", facet_col_wrap=2)
# fig.show()
# sns.lineplot(x='T', y='value_hrf', hue='sensor', data=temp_df)
# axes = single_year['value_hrf'].plot(marker='.',
# alpha=0.5,
# linestyle='None',
# figsize=(11, 9),
# subplots=True)
# for ax in axes:
# ax.set_ylabel('Daily Totals (GWh)')
# plt.figure(figsize = (19,8))
# sns.lineplot(y = temp_df.loc['2020-01-01':'2020-12-31'].value_hrf,
# x = time_series_df.loc['2020-01-01':'2020-12-31'].index.values);
# plt.figure(figsize = (19,8))
# sns.lineplot(y = temp_df.loc['2020-01-01':'2020-12-31'].value_hrf,
# x = time_series_df.loc['2020-01-01':'2020-12-31'].index.values);
# you can filter as deep as you want fyi:
# tempo = df[ (df['node_id']=='001e0610ee36') & (df['sensor']=='hih6130') & (df['parameter']=='temperature') & (df['subsystem']=='lightsense') ]
# if you 'compute' this, it will then be a pandas df, and then you can export...
# import matplotlib.pyplot as plt
# import seaborn as sns
# sns.set_theme(style="darkgrid")
# x = time_series_df.index[20000:20050]
# # loc['2020-06-01':'2021-02-14']
# y = time_series_df[:500].values
# sns.lineplot(x,y)
# plt.show();
# plt.figure(figsize = (19,8))
# sns.lineplot(y = time_series_df.loc['2020-09-18':'2021-02-13'].values ,
# x = time_series_df.loc['2020-09-18':'2021-02-13'].index.values);